Dissolved Oxygen Saturation Report

get data across all programs, filter for analyte

library('here')
library('dplyr')
source(here("R/getAllData.R"))

# read from all source files
# full_df <- getAllData() %>%
#   mutate(
#     source = program,
#     site = Monitoring.Location.ID,
#     datetime = Activity.Start.Date.Time,
#     analyte = DEP.Analyte.Name,
#     value = DEP.Result.Value.Number,
#     units = DEP.Result.Unit,
#     latitude = Org.Decimal.Latitude,
#     longitude = Org.Decimal.Longitude,
#     sample_depth = Activity.Depth,
#     .keep = "none"
# )

# read from cached file produced by index.qmd
full_df <- read.csv(here("data", "exports", "allData.csv"))

df <- filter(full_df, analyte == params$analyte)

create .csv of analyte data

# save df to csv
# reduce to only cols we need & save to csv
df %>%
  write.csv(here("data", "exports", "unified-wq-db-samples", paste0(params$analyte, ".csv")))

`value` Distribution Across All Datasets

display histogram of values

library(ggplot2)
ggplot(df, aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  scale_y_log10() +
  labs(title = "Histogram of Values", x = "Value", y = "Log10(Count)")

Note that values are often not directly comparable due to the use of different units.

Show units used by each program

library(ggplot2)

ggplot(df, aes(x = source, fill = units)) +
  geom_bar() +
  scale_y_log10() +
  labs(
    title = "Count of Observations by Program and Units",
    x = "Program Name",
    y = "Count (Log Scale)",
    fill = "Units Reported"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Show values grouped by units

library(ggplot2)

ggplot(df, aes(x = units, y = value)) +
  geom_violin(width = 1.2, adjust = 10, fill = "skyblue", color = "black") +
  geom_boxplot(width = 0.1, outlier.shape = NA) +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "unit", y = "Log10(Value)", title = "Violin + Boxplot")

Values Reported By Program:

Show value distributions for each program

library(ggplot2)

ggplot(df, aes(x = source, y = value)) +
  geom_violin(width = 1.2, adjust = 10, fill = "skyblue", color = "black") +
  geom_boxplot(width = 0.1, outlier.shape = NA) +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Program", y = "Log10(Value)", title = "Violin + Boxplot")

Station Statistics:

create station statistics dataframe

source(here("R/seasonalMannKendall.R"))
library(lubridate)  # for mdy_hms()
library(pander)  # for display

# create table of samples for each station
samples_df <- df %>%
  # drop any with empty Monitoring.Location.ID
  filter(!is.na(site)) %>%
  # drop any with empty Activity.Start.Date.Time
  filter(!is.na(datetime)) %>%
  # parse the "YYYY-MM-DD" strings into POSIXct
  mutate(datetime = ymd(
          datetime,
          tz = "UTC")) %>%
  distinct()


# add statistics for each station
sample_stats_df <- samples_df %>%
  group_by(source, site) %>%
  reframe(
    {
      tmp <- seasonalMannKendallVectorized(
        datetime,
        value
      )
    },
    n_values = n(),
    mean = mean(value),
    min = min(value),
    max = max(value),
    coefficient.of.variation = sd(value) / mean(value)
  ) %>%
  mutate(
    # create column significant_slope
    significant_slope = ifelse(z <= 0.05, slope, NA_real_),
    pvalue = z
  ) %>%
  # drop unwanted columns added by seasonalMannKendall
  select(
    -z,
    -tau,
    -chi_square
  )



# print(head(sample_stats_df))
# # display sample_stats_df with pander
# pander(sample_stats_df)

save stats to csv

sample_stats_df %>%
  write.csv(here(
    "data", "exports", "seasonal-mann-kendall-stats", paste0(params$analyte, ".csv")))

Download data for this analyte’s statistics here

display with gt

library(gt)
library(scales)
library(tidyselect)  # for all_of()
library(RColorBrewer) # for brewer.pal()

# ── color_column() ─────────────────────────────────────────────────────────────
# gt_tbl   : a gt object that you’ve already created (e.g. `sample_stats_df %>% gt()`)
# df       : the original data.frame (must contain the column you want to color)
# column   : a string, e.g. "slope" or "n_values"
# palette  : a character vector of colours to feed to col_numeric()
#
color_column <- function(gt_tbl, df, column, 
                         palette = c("red", "orange", "yellow", "green", "blue", "violet"),
                         domain = NULL) {
  # 1) Pull out that column’s numeric values
  vals <- df[[column]]
  if (!is.numeric(vals)) {
    stop(sprintf("`%s` is not numeric; data_color() requires a numeric column.", column))
  }
  
  # 2) Compute its min and max (ignoring NA)
  min_val <- min(vals, na.rm = TRUE)
  max_val <- max(vals, na.rm = TRUE)
  if (is.null(domain)) {
    domain <- c(min_val, max_val)
  }
  
  # 3) Call data_color() on the gt table for that single column
  gt_tbl %>%
    data_color(
      columns = all_of(column),
      colors  = col_numeric(
        palette = palette,
        domain  = domain
      )
    )
}
library(dplyr)
library(gt)

# 1) First build your gt table as usual:
gt_tbl <- sample_stats_df %>% 
  gt()

# slope blue (-) to red (+) (0 centered)
tryCatch({
  min_slope <- min(sample_stats_df$slope,  na.rm = TRUE)
  max_slope <- max(sample_stats_df$slope,  na.rm = TRUE)
  max_abs_slope <- max(abs(min_slope), abs(max_slope))
  gt_tbl <- color_column(
    gt_tbl, 
    df     = sample_stats_df, 
    column = "slope",
    palette = rev(brewer.pal(11, "RdBu")),
    domain  = c(-max_abs_slope, max_abs_slope)
  )
}, error = function(e) {
  print("Error in slope color column")
  print(e)
})

tryCatch({
  # pvalue Z
  gt_tbl <- color_column(
    gt_tbl, 
    df      = sample_stats_df, 
    column  = "z", 
    palette = scales::brewer_pal(palette = "Blues")(9)
  )
}, error = function(e) {
  print("Error in z color column")
  print(e)
})

[1] "Error in z color column"
<simpleError in color_column(gt_tbl, df = sample_stats_df, column = "z", palette = (scales::brewer_pal(palette = "Blues"))(9)): `z` is not numeric; data_color() requires a numeric column.>

display with gt

# slope blue (-) to red (+) (0 centered)
tryCatch({
  min_slope <- min(sample_stats_df$significant_slope,  na.rm = TRUE)
  max_slope <- max(sample_stats_df$significant_slope,  na.rm = TRUE)
  max_abs_slope <- max(abs(min_slope), abs(max_slope))
  gt_tbl <- color_column(
    gt_tbl, 
    df     = sample_stats_df, 
    column = "significant_slope",
    palette = rev(brewer.pal(11, "RdBu")),
    domain  = c(-max_abs_slope, max_abs_slope)
  )
}, error = function(e) {
  print("Error in significant_slope color column")
  print(e)
})

[1] "Error in significant_slope color column"
<rlang_error in col_numeric(palette = palette, domain = domain): Wasn't able to determine range of `domain`>

display with gt

tryCatch({
  # mean values blue to red (0 centered)
  min_mean <- min(sample_stats_df$mean,  na.rm = TRUE)
  max_mean <- max(sample_stats_df$mean,  na.rm = TRUE)
  max_abs_mean <- max(abs(min_mean), abs(max_mean))
  gt_tbl <- color_column(
    gt_tbl, 
    df     = sample_stats_df, 
    column = "mean",
    palette = rev(brewer.pal(11, "RdBu")),
    domain  = c(-max_abs_mean, max_abs_mean)
  )
}, error = function(e) {
  print("Error in mean color column")
  print(e)
})

tryCatch({
  # n values white to green
  gt_tbl <- color_column(
    gt_tbl, 
    df      = sample_stats_df, 
    column  = "n_values", 
    palette = scales::brewer_pal(palette = "Greens")(9)
  )
}, error = function(e) {
  print("Error in n_values color column")
  print(e)
})

tryCatch({
  # min
  gt_tbl <- color_column(
    gt_tbl, 
    df      = sample_stats_df, 
    column  = "min", 
    palette = scales::brewer_pal(palette = "Blues")(9)
  )
}, error = function(e) {
  print("Error in min color column")
  print(e)
})

tryCatch({
  # max
  gt_tbl <- color_column(
    gt_tbl, 
    df      = sample_stats_df, 
    column  = "max", 
    palette = scales::brewer_pal(palette = "Blues")(9)
  )
}, error = function(e) {
  print("Error in max color column")
  print(e)
})

tryCatch({
  # coefficient.of.variation
  gt_tbl <- color_column(
    gt_tbl, 
    df      = sample_stats_df, 
    column  = "coefficient.of.variation", 
    palette = scales::brewer_pal(palette = "Blues")(9)
  )
}, error = function(e) {
  print("Error in coefficient.of.variation color column")
  print(e)
})

# 4) Render/display:
gt_tbl

source	site	slope	n_values	mean	min	max	coefficient.of.variation	significant_slope	pvalue
FIU_WQMP_HISTORICAL	203	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	209	0.34247582	74	NA	NA	NA	NA	NA	0.33746767
FIU_WQMP_HISTORICAL	210	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	212	NA	1	100.18294	100.18294	100.18294	NA	NA	NA
FIU_WQMP_HISTORICAL	214	NA	3	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	218	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	223	NA	1	96.66486	96.66486	96.66486	NA	NA	NA
FIU_WQMP_HISTORICAL	224	NA	2	96.01678	95.18214	96.85143	0.012293324	NA	NA
FIU_WQMP_HISTORICAL	229	NA	1	87.25896	87.25896	87.25896	NA	NA	NA
FIU_WQMP_HISTORICAL	231	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	232	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	241	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	243i	NA	1	84.10067	84.10067	84.10067	NA	NA	NA
FIU_WQMP_HISTORICAL	244	NA	4	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	253	NA	4	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	257	9.01799581	14	NA	NA	NA	NA	NA	1.00000000
FIU_WQMP_HISTORICAL	266	NA	3	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	282	NA	4	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	284	0.43401146	10	103.91575	76.87777	136.21289	0.175427631	NA	1.00000000
FIU_WQMP_HISTORICAL	285	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	286	NA	1	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	287	NA	4	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	288	NA	2	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	289	NA	2	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	290	NA	1	96.23735	96.23735	96.23735	NA	NA	NA
FIU_WQMP_HISTORICAL	293	NA	1	94.93463	94.93463	94.93463	NA	NA	NA
FIU_WQMP_HISTORICAL	294	NA	3	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	295	NA	1	95.18155	95.18155	95.18155	NA	NA	NA
FIU_WQMP_HISTORICAL	296	0.53576814	8	NA	NA	NA	NA	NA	1.00000000
FIU_WQMP_HISTORICAL	297	NA	2	107.45668	98.35624	116.55712	0.119768916	NA	NA
FIU_WQMP_HISTORICAL	298	NA	1	105.33426	105.33426	105.33426	NA	NA	NA
FIU_WQMP_HISTORICAL	299	NA	1	59.74049	59.74049	59.74049	NA	NA	NA
FIU_WQMP_HISTORICAL	300	0.42435663	17	92.87637	68.51886	122.90485	0.156931019	NA	0.79843196
FIU_WQMP_HISTORICAL	301	0.55611383	66	NA	NA	NA	NA	NA	0.09171481
FIU_WQMP_HISTORICAL	307	NA	1	98.41701	98.41701	98.41701	NA	NA	NA
FIU_WQMP_HISTORICAL	309	NA	1	101.62781	101.62781	101.62781	NA	NA	NA
FIU_WQMP_HISTORICAL	310	0.49585380	78	NA	NA	NA	NA	NA	0.18675157
FIU_WQMP_HISTORICAL	311	0.48553710	64	NA	NA	NA	NA	NA	0.34212089
FIU_WQMP_HISTORICAL	312	NA	2	99.20678	97.91498	100.49858	0.018414913	NA	NA
FIU_WQMP_HISTORICAL	313	-0.01502767	65	NA	NA	NA	NA	NA	0.95574133
FIU_WQMP_HISTORICAL	316	NA	2	95.78017	93.59614	97.96420	0.032247664	NA	NA
FIU_WQMP_HISTORICAL	330	NA	3	97.69877	95.70935	98.91301	0.017777273	NA	NA
FIU_WQMP_HISTORICAL	337	NA	4	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	500	NA	2	95.72692	95.41264	96.04120	0.004642967	NA	NA
FIU_WQMP_HISTORICAL	501	0.25587242	12	NA	NA	NA	NA	NA	0.47950012
FIU_WQMP_HISTORICAL	502	-0.17008752	8	97.13241	93.63745	99.28568	0.020024825	NA	1.00000000
FIU_WQMP_HISTORICAL	503	-0.37170328	14	NA	NA	NA	NA	NA	1.00000000
FIU_WQMP_HISTORICAL	504	NA	2	97.16719	94.96974	99.36463	0.031982570	NA	NA
FIU_WQMP_HISTORICAL	505	NA	2	NA	NA	NA	NA	NA	NA
FIU_WQMP_HISTORICAL	507	NA	1	93.40655	93.40655	93.40655	NA	NA	NA
FIU_WQMP_HISTORICAL	508	2.72880742	13	89.08614	44.66796	107.23978	0.194585977	NA	0.21129955
FIU_WQMP_HISTORICAL	509	-0.08058467	15	95.42682	91.47474	98.55585	0.026962948	NA	0.59731157

value Distribution Across All Datasets

Values Reported By Program:

Station Statistics:

`value` Distribution Across All Datasets